R Markdown

Task 1: Load and clean the FacultySalaries_1995.csv file and re-create the graph below…

View of the clean data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(stringr)
library(ggpubr)
library(broom)
library(AICcmodavg)
library(knitr)
library(DT)
# Read the 1995 faculty salary table, normalise the column names, then reshape
# to long form in three passes:
#   salary columns       -> rank / salary
#   compensation columns -> comp_type / comp_amt
#   head-count columns   -> faculty_type / faculty_count
# NOTE(review): each pass multiplies the row count by the number of columns it
# stacks, so every salary value is repeated once per comp_type x faculty_type
# combination. De-duplicate before fitting models on salary.
df <- read_csv("Data/FacultySalaries_1995.csv") %>%
  clean_names() %>%
  pivot_longer(
    cols = ends_with("salary"),
    names_prefix = "avg_",
    names_to = "rank",
    values_to = "salary"
  ) %>%
  pivot_longer(
    cols = ends_with("comp"),
    names_to = "comp_type",
    values_to = "comp_amt"
  ) %>%
  pivot_longer(
    cols = c(num_full_profs, num_assoc_profs, num_assist_profs),
    names_to = "faculty_type",
    values_to = "faculty_count"
  )
## Rows: 1161 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): UnivName, State, Tier
## dbl (14): FedID, AvgFullProfSalary, AvgAssocProfSalary, AvgAssistProfSalary,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop the "VIIB" tier. Refer to the column through dplyr's data mask instead
# of `df$tier`, so the condition always describes the data being piped in.
# Rows whose tier is NA are dropped as well, because filter() keeps only TRUE.
df <- df %>%
  filter(tier != "VIIB")

# Interactive, filterable view of the cleaned data: 5 rows per page, with
# horizontal scrolling for the wide table. `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
datatable(
  df,
  rownames = FALSE,
  filter = "top",
  options = list(pageLength = 5, scrollX = TRUE)
)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Graph

# Box plots of salary by rank, one panel per tier. The x-axis labels are
# rotated 90 degrees so the rank names do not overlap.
ggplot(df, aes(x = rank, y = salary, fill = rank)) +
  geom_boxplot() +
  facet_wrap(vars(tier)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 1152 rows containing non-finite values (`stat_boxplot()`).

Task 2: ANOVA

# Additive ANOVA of salary on state, rank and tier.
#
# `df` repeats every salary once per comp_type x faculty_type combination
# (a side effect of the later pivot_longer() passes). Fitting on it directly
# treats those copies as independent observations and inflates the residual
# degrees of freedom and the F statistics. Drop the columns introduced by
# those passes and de-duplicate so each institution/rank salary is used once.
salary_obs <- df %>%
  select(-comp_type, -comp_amt, -faculty_type, -faculty_count) %>%
  distinct()

twoway <- aov(salary ~ state + rank + tier, data = salary_obs)
summary(twoway)
##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## state          50  51004114  1020082   284.7 <2e-16 ***
## rank            2 150993010 75496505 21071.8 <2e-16 ***
## tier            2  61397872 30698936  8568.3 <2e-16 ***
## Residuals   30113 107889768     3583                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1152 observations deleted due to missingness

Task 3: Load and tidy Juniper_Oils.csv

# Read the juniper oil samples and stack the per-compound concentration
# columns into long form (one row per sample x compound), then normalise the
# remaining column names with clean_names().
df2 <- read_csv("Data/Juniper_Oils.csv") %>%
  pivot_longer(
    cols = c(
      "alpha-pinene", "para-cymene", "alpha-terpineol", "cedr-9-ene",
      "alpha-cedrene", "beta-cedrene", "cis-thujopsene", "alpha-himachalene",
      "beta-chamigrene", "cuparene", "compound 1", "alpha-chamigrene",
      "widdrol", "cedrol", "beta-acorenol", "alpha-acorenol",
      "gamma-eudesmol", "beta-eudesmol", "alpha-eudesmol", "cedr-8-en-13-ol",
      "cedr-8-en-15-ol", "compound 2", "thujopsenal"
    ),
    names_to = "chem_comp",
    values_to = "concentration"
  ) %>%
  clean_names()
## Rows: 34 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): SampleID, Project, Amplicon, Tree_Species, Field_Office, BLM_Fire_...
## dbl (34): BurnYear, Latitude, Longitude, alpha-pinene, para-cymene, alpha-te...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural check of the reshaped data: column names, types, first values.
df2 %>%
  glimpse()
## Rows: 782
## Columns: 20
## $ sample_id              <chr> "LOG-16S-SL12", "LOG-16S-SL12", "LOG-16S-SL12",…
## $ project                <chr> "JuniperLogs", "JuniperLogs", "JuniperLogs", "J…
## $ amplicon               <chr> "16S", "16S", "16S", "16S", "16S", "16S", "16S"…
## $ tree_species           <chr> "Juniperus osteosperma", "Juniperus osteosperma…
## $ burn_year              <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,…
## $ latitude               <dbl> 41.5719, 41.5719, 41.5719, 41.5719, 41.5719, 41…
## $ longitude              <dbl> -113.7488, -113.7488, -113.7488, -113.7488, -11…
## $ field_office           <chr> "Salt_Lake_3", "Salt_Lake_3", "Salt_Lake_3", "S…
## $ blm_fire_name          <chr> "Ridge", "Ridge", "Ridge", "Ridge", "Ridge", "R…
## $ tracking_number        <chr> "#5276 (2018)", "#5276 (2018)", "#5276 (2018)",…
## $ yield_percent          <dbl> 0.18, 0.18, 0.18, 0.18, 0.18, 0.18, 0.18, 0.18,…
## $ bolt_surface_area_cm2  <dbl> 1687, 1687, 1687, 1687, 1687, 1687, 1687, 1687,…
## $ raw_exit_holes_per_cm2 <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,…
## $ raw_exit_holes         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ living_larvae          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ chem_total             <dbl> 87.7, 87.7, 87.7, 87.7, 87.7, 87.7, 87.7, 87.7,…
## $ chem_mean              <dbl> 3.813043, 3.813043, 3.813043, 3.813043, 3.81304…
## $ years_since_burn       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ chem_comp              <chr> "alpha-pinene", "para-cymene", "alpha-terpineol…
## $ concentration          <dbl> 0.6, 0.4, 2.3, 0.2, 1.9, 1.2, 25.9, 0.2, 0.4, 1…

Task 4: Graph

# Smoothed concentration against years since burn, one panel per compound.
# Each panel gets its own y scale.
ggplot(df2, aes(x = years_since_burn, y = concentration)) +
  geom_smooth() +
  facet_wrap(~chem_comp, scales = "free_y")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Task 5: Generalized linear model

library(broom)
library(stringr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# GLM of concentration on years since burn and compound, using glm()'s
# default family. `chem_comp` is a character column, so each compound
# coefficient is a contrast against the reference compound level.
mod1 <- glm(concentration ~ years_since_burn + chem_comp, data = df2)

# One row per model term. The previous `conf.level = TRUE` was removed:
# conf.level expects a numeric level (e.g. 0.95) and only matters together
# with `conf.int = TRUE`, so it had no effect on the result.
df3 <- tidy(mod1)

# Keep the terms that are significant at the 5% level.
df4 <- df3 %>%
  filter(p.value < 0.05)

# Strip the "chem_comp" prefix so the table shows plain compound names.
# mutate() keeps the result a tibble, unlike base transform().
df5 <- df4 %>%
  mutate(term = str_replace(term, "chem_comp", ""))

kable(
  x = df5,
  align = "c",
  caption = "Statistically significant compounds",
  col.names = c("compound", "estimate", "std.error", "statistic", "p.value")
)
Statistically significant compounds
compound estimate std.error statistic p.value
alpha-cedrene 10.623529 0.8993768 11.812101 0.0000000
beta-cedrene 2.776471 0.8993768 3.087105 0.0020948
beta-eudesmol 1.788235 0.8993768 1.988305 0.0471372
cedr-8-en-13-ol 5.552941 0.8993768 6.174210 0.0000000
cedrol 19.823529 0.8993768 22.041407 0.0000000
cis-thujopsene 21.298824 0.8993768 23.681759 0.0000000
compound 1 1.817647 0.8993768 2.021007 0.0436301
cuparene 1.923529 0.8993768 2.138736 0.0327760
thujopsenal 1.788235 0.8993768 1.988305 0.0471372
widdrol 6.135294 0.8993768 6.821718 0.0000000